{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "This file is for replication Table4.\n",
    "You need invent_location, NBER_class, origin_10per_rand, and all control files\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#rounding\n",
    "r = 2 \n",
    "#California\n",
    "NCA = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "starting_year = [1976, 1986, 1996, 2006]\n",
    "end_year = [1985, 1995, 2005, 2015]\n",
    "\n",
    "LOC = pd.read_csv('invent_location.csv', sep='*')\n",
    "Upper = pd.read_csv('origin_10per_rand.csv', sep='*')\n",
    "NBER_class = pd.read_csv('NBER_class.csv', sep='\\t')\n",
    "\n",
    "\"\"\"Set control criteria name and a rounding decision\"\"\"\n",
    "Z = ['3-digit','Any','Primary','Common']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table = pd.DataFrame()\n",
    "\n",
    "for x, y in zip(starting_year, end_year):\n",
    "    result = pd.DataFrame()\n",
    "    \n",
    "    p_hat_citing_list = []\n",
    "    SE_hat_citing_list = []\n",
    "    \n",
    "    for z in [1,2,3,4]:\n",
    "        \n",
    "        file_in = 'controls' + str(z) + '_' + str(x) + '_' + str(y) +'.csv'\n",
    "        www = pd.read_csv(file_in, sep=','); www = www[['cited','citing','citing_control','period']]\n",
    "        www = pd.merge(www,Upper,left_on='cited',right_on='wku',how='inner'); www = www.drop('wku', axis=1)\n",
    "        www = pd.merge(www,LOC,left_on='cited',right_on='wku',how='inner'); www = www.drop('wku', axis=1)\n",
    "        www.columns = ['cited','citing','control','period','cnt_cited','sta_cited','cmsa_cited']\n",
    "        www = www[www.cnt_cited == \"US\"] \n",
    "    \n",
    "        www = www[www.period > 0]\n",
    "        www = pd.merge(www,LOC,left_on='citing',right_on='wku',how='inner'); www = www.drop('wku', axis=1)\n",
    "        www.columns = ['cited','citing','control','period','cnt_cited', 'sta_cited','cmsa_cited','cnt_citing','sta_citing','cmsa_citing']\n",
    "        www = pd.merge(www,LOC,left_on='control',right_on='wku',how='inner'); www = www.drop('wku', axis=1)\n",
    "        www.columns = ['cited','citing','control','period','cnt_cited','sta_cited','cmsa_cited','cnt_citing','sta_citing','cmsa_citing','cnt_control','sta_control','cmsa_control']\n",
    "        www = pd.merge(www,NBER_class[['wku','subcat']],left_on='cited',right_on='wku',how='inner'); www = www.drop('wku', axis=1)\n",
    "        www = www.rename(columns = {'subcat':'subcat_cited'})\n",
    "        www['x'] = 1\n",
    "        \n",
    "        ###################################\n",
    "        if NCA==1: www = www[www['sta_cited']!= 'CA']\n",
    "        ###################################\n",
    "      \n",
    "        p_hat_control_list = []\n",
    "        t_value_list = []\n",
    "\n",
    "        if z == 1:\n",
    "            \n",
    "            for location in ['cnt','sta','cmsa']:\n",
    "                \n",
    "                www['cited_equals_citing_'+location] = (www[location+'_cited']==www[location+'_citing'])\n",
    "                                \n",
    "                citing_calc = www[['x','cited_equals_citing_'+location,'sta_cited','subcat_cited']]\n",
    "                citing_calc = citing_calc.groupby(['sta_cited','subcat_cited']).sum()\n",
    "                citing_calc['p_ij'] = citing_calc['cited_equals_citing_'+location]/citing_calc['x']\n",
    "                citing_calc['w_ij'] = citing_calc['x']/(citing_calc['x'].sum())\n",
    "                citing_calc['w_p'] = citing_calc['p_ij']*citing_calc['w_ij']\n",
    "                p_hat_citing = citing_calc.sum()['w_p']\n",
    "                p_hat_citing_list.append(p_hat_citing)\n",
    "                citing_calc['ww_pp'] = (citing_calc['p_ij']-p_hat_citing)**2*citing_calc['w_ij']**2\n",
    "                SE_hat_citing = np.sqrt(citing_calc.sum()['ww_pp'])\n",
    "                SE_hat_citing_list.append(SE_hat_citing)\n",
    "                \n",
    "                \n",
    "                www['cited_equals_control_'+location] = (www[location+'_cited']==www[location+'_control'])\n",
    "                \n",
    "                control_calc = www[['x','cited_equals_control_'+location,'sta_cited','subcat_cited']]\n",
    "                control_calc = control_calc.groupby(['sta_cited','subcat_cited']).sum()\n",
    "                control_calc['p_ij'] = control_calc['cited_equals_control_'+location]/control_calc['x']\n",
    "                control_calc['w_ij'] = control_calc['x']/control_calc['x'].sum()\n",
    "                control_calc['w_p'] = control_calc['p_ij']*control_calc['w_ij']\n",
    "                p_hat_control = control_calc.sum()['w_p']\n",
    "                p_hat_control_list.append(p_hat_control)\n",
    "                control_calc['ww_pp'] = (control_calc['p_ij']-p_hat_control)**2*control_calc['w_ij']**2\n",
    "                SE_hat_control = np.sqrt(control_calc.sum()['ww_pp'])\n",
    "                \n",
    "                t_value_list.append((p_hat_citing-p_hat_control)/np.sqrt(SE_hat_citing**2+SE_hat_control**2))\n",
    "            \n",
    "            \n",
    "            Result = pd.DataFrame(columns=['','citing',Z[z-1]])\n",
    "            Result.loc[str(x)] = ['TOTAL',str(len(www)),str(len(www))]\n",
    "            Result.loc[' '] = ['country',round(p_hat_citing_list[0]*100,r), round(p_hat_control_list[0]*100,r)]\n",
    "            Result.loc['  '] = ['','', '('+ str(round(t_value_list[0],r)) +')']\n",
    "            Result.loc['   '] = ['state', round(p_hat_citing_list[1]*100,r), round(p_hat_control_list[1]*100,r)]\n",
    "            Result.loc['    '] = ['','', '('+ str(round(t_value_list[1],r)) +')']\n",
    "            Result.loc['     '] = ['CMSA',round(p_hat_citing_list[2]*100,r), round(p_hat_control_list[2]*100,r)]\n",
    "            Result.loc['      '] = ['','', '('+ str(round(t_value_list[2],r)) +')']\n",
    "            \n",
    "            result = pd.concat([result, Result], axis = 1)\n",
    "        \n",
    "        else:\n",
    "            for [i, location] in enumerate(['cnt','sta','cmsa']):\n",
    "                \n",
    "                www['cited_equals_control_'+location] = (www[location+'_cited']==www[location+'_control'])\n",
    "                \n",
    "                control_calc = www[['x','cited_equals_control_'+location,'sta_cited','subcat_cited']]\n",
    "                control_calc = control_calc.groupby(['sta_cited','subcat_cited']).sum()\n",
    "                control_calc['p_ij'] = control_calc['cited_equals_control_'+location]/control_calc['x']\n",
    "                control_calc['w_ij'] = control_calc['x']/control_calc['x'].sum()\n",
    "                control_calc['w_p'] = control_calc['p_ij']*control_calc['w_ij']\n",
    "                p_hat_control = control_calc.sum()['w_p']\n",
    "                p_hat_control_list.append(p_hat_control)\n",
    "                control_calc['ww_pp'] = (control_calc['p_ij']-p_hat_control)**2*control_calc['w_ij']**2\n",
    "                SE_hat_control = np.sqrt(control_calc.sum()['ww_pp'])\n",
    "                \n",
    "                t_value_list.append((p_hat_citing_list[i]-p_hat_control)/np.sqrt(SE_hat_citing_list[i]**2+SE_hat_control**2))\n",
    "                        \n",
    "            \n",
    "            Result = pd.DataFrame(columns=[Z[z-1]])\n",
    "            Result.loc[str(x)] = str(len(www))\n",
    "            Result.loc[' '] = [round(p_hat_control_list[0]*100,r)]\n",
    "            Result.loc['  '] = ['('+ str(round(t_value_list[0],r)) +')']\n",
    "            Result.loc['   '] = [round(p_hat_control_list[1]*100,r)]\n",
    "            Result.loc['    '] = ['('+ str(round(t_value_list[1],r)) +')']\n",
    "            Result.loc['     '] = [round(p_hat_control_list[2]*100,r)]\n",
    "            Result.loc['      '] = ['('+ str(round(t_value_list[2],r)) +')']\n",
    "\n",
    "            result = pd.concat([result, Result], axis = 1)\n",
    "        \n",
    "    Table = pd.concat([Table,result], axis = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "print(Table.to_latex(column_format='llccccc'))"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
